import tensorflow as tf
import numpy as np

# Configure TensorFlow GPU usage before any op initializes the devices.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU.
        # (fix: the original indexed gpus[1] — the *second* GPU — which
        # contradicted this comment and raises IndexError on single-GPU hosts)
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)
# notebook output: 3 Physical GPUs, 1 Logical GPUs
import os
# Run SDL headless so PLE/pygame does not open a display window.
os.environ["SDL_VIDEODRIVER"] = "dummy"  # this line make pop-out window not appear

from ple.games.flappybird import FlappyBird
from ple import PLE

# Build the FlappyBird game and wrap it in the PLE environment interface.
game = FlappyBird()
env = PLE(game, fps=30, display_screen=False)  # environment interface to game
env.reset_game()
# notebook output: pygame 1.9.6 Hello from the pygame community. https://www.pygame.org/contribute.html couldn't import doomish Couldn't import doom
class Agent:
    """DQN agent: a 4-layer MLP Q-network plus epsilon-greedy action selection."""

    def __init__(self, name, num_action, discount_factor=0.99):
        # epsilon: probability of taking a random action
        self.exploring_rate = 0.1
        self.discount_factor = discount_factor
        self.num_action = num_action
        self.model = self.build_model(name)

    def build_model(self, name):
        """Build the Q-network.

        input: state (8-dim float vector)
        output: each action's Q-value, shape (batch_size, num_action)
        """
        state_input = tf.keras.Input(shape=[8], dtype=tf.float32)
        x = state_input
        # four Dense(512) + ReLU hidden layers
        for _ in range(4):
            x = tf.keras.layers.Dense(units=512)(x)
            x = tf.keras.layers.ReLU()(x)
        Q = tf.keras.layers.Dense(self.num_action)(x)
        model = tf.keras.Model(name=name, inputs=state_input, outputs=Q)
        return model

    def loss(self, state, action, reward, tar_Q, terminal):
        """Mean-squared TD error: E[(r + gamma * max_a' Q'(s',a') - Q(s,a))^2].

        Fixes vs. original: the parameter was misspelled `ternimal` while the
        body read `terminal` (NameError at first call), and the terminal mask
        used `~np.array(...)` on a tf.bool tensor, which does not trace
        correctly inside @tf.function — replaced with tf-native ops.
        """
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)
        index = tf.stack([tf.range(tf.shape(action)[0]), action], axis=1)
        # Q(s,a,theta) for the selected a, shape (batch_size,)
        Q = tf.gather_nd(output, index)
        # zero the bootstrap target when s' is a terminal state
        tar_Q *= tf.cast(tf.logical_not(terminal), tar_Q.dtype)
        loss = tf.reduce_mean(tf.square(reward + self.discount_factor * tar_Q - Q))
        return loss

    def max_Q(self, state):
        """Return max_a Q(s,a), shape (batch_size,)."""
        # Q(s,a,theta) for all a, shape (batch_size, num_action)
        output = self.model(state)
        return tf.reduce_max(output, axis=1)

    def select_action(self, state):
        """Epsilon-greedy action selection for a single (unbatched) state."""
        if np.random.rand() < self.exploring_rate:
            action = np.random.choice(self.num_action)  # select a random action
        else:
            state = np.expand_dims(state, axis=0)
            # Q(s,a,theta) for all a, shape (1, num_action)
            output = self.model(state)
            # select the action with the highest action-value
            action = tf.argmax(output, axis=1)[0]
        return action

    def update_parameters(self, episode):
        """Decay epsilon with the episode count, clamped to [MIN_EXPLORING_RATE, 0.5]."""
        self.exploring_rate = max(MIN_EXPLORING_RATE, min(0.5, 0.99**((episode) / 30)))

    def shutdown_explore(self):
        # make action selection greedy
        self.exploring_rate = 0
# init agent
num_action = len(env.getActionSet())
# agent for frequently updating
online_agent = Agent('online', num_action)
# agent for slow updating
target_agent = Agent('target', num_action)
# synchronize target model's weight with online model's weight
target_agent.model.set_weights(online_agent.model.get_weights())

# shared optimizer for the online network and a running mean of the TD loss
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
average_loss = tf.keras.metrics.Mean(name='loss')
@tf.function
def train_step(state, action, reward, next_state, terminal):
    """One DQN gradient step on the online network.

    Targets come from the delayed target network (fixed during this step);
    only the online network's weights are updated. The loss is accumulated
    into the module-level `average_loss` metric.
    (fix: renamed the misspelled parameter `ternimal` -> `terminal`;
    callers pass it positionally, so the interface is unchanged.)
    """
    # Delayed Target Network: bootstrap target max_a' Q'(s', a')
    tar_Q = target_agent.max_Q(next_state)
    with tf.GradientTape() as tape:
        loss = online_agent.loss(state, action, reward, tar_Q, terminal)
    gradients = tape.gradient(loss, online_agent.model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, online_agent.model.trainable_variables))
    average_loss.update_state(loss)
class Replay_buffer():
    """Fixed-capacity FIFO store of (s, a, r, s', terminal) transitions."""

    def __init__(self, buffer_size=50000):
        self.experiences = []
        self.buffer_size = buffer_size

    def add(self, experience):
        """Append one transition, evicting the oldest once the buffer is full."""
        if len(self.experiences) >= self.buffer_size:
            self.experiences.pop(0)
        self.experiences.append(experience)

    def sample(self, size):
        """
        sample experience from buffer

        Draws without replacement when the buffer holds at least `size`
        transitions, otherwise with replacement. Returns five parallel lists:
        states, actions, rewards, next states, terminal flags.
        """
        stored = len(self.experiences)
        picked = np.random.choice(stored, size=size, replace=size > stored)
        # split each sampled (s, a, r, s', terminal) tuple into its own column
        columns = [[], [], [], [], []]
        for idx in picked:
            for column, value in zip(columns, self.experiences[idx]):
                column.append(value)
        states, actions, rewards, states_prime, terminal = columns
        return states, actions, rewards, states_prime, terminal
# init buffer
# replay buffer shared by the training loop below (default capacity 50000)
buffer = Replay_buffer()
import moviepy.editor as mpy
def make_anim(images, fps=60, true_image=False):
    """Wrap a list of frames in a moviepy VideoClip playing at `fps`.

    If `true_image` is set, frames are assumed to already be 0-255 pixel
    arrays; otherwise they are assumed to lie in [-1, 1] and are rescaled
    to uint8 — TODO confirm the [-1, 1] range against the caller.
    """
    duration = len(images) / fps

    def make_frame(t):
        # map time t back to a frame index, clamping to the last frame
        try:
            x = images[int(len(images) / duration * t)]
        except IndexError:  # fix: was a bare `except`, which hid real errors
            x = images[-1]
        if true_image:
            return x.astype(np.uint8)
        else:
            return ((x + 1) / 2 * 255).astype(np.uint8)

    clip = mpy.VideoClip(make_frame, duration=duration)
    clip.fps = fps
    return clip
import copy
from collections import defaultdict
# Bucket width for each state feature; dividing by it discretizes the
# (continuous) feature to shrink the state space.
bucket_range_per_feature = {
    'next_next_pipe_bottom_y': 40,
    'next_next_pipe_dist_to_player': 512,
    'next_next_pipe_top_y': 40,
    'next_pipe_bottom_y': 20,
    'next_pipe_dist_to_player': 20,
    'next_pipe_top_y': 20,
    'player_vel': 4,
    'player_y': 16
}


def preprocess_state(state):
    """Convert the raw game-state dict into a bucketed 8-dim integer vector.

    Pipe y-coordinates are made relative to the player's position before
    bucketing; the caller's dict is left unmodified. Features are emitted
    in sorted-key order.
    """
    # instead of using absolute position of pipe, use relative position
    state = copy.deepcopy(state)
    for relative_key in ('next_next_pipe_bottom_y', 'next_next_pipe_top_y',
                         'next_pipe_bottom_y', 'next_pipe_top_y'):
        state[relative_key] -= state['player_y']
    # do bucketing to decrease state space to speed up training
    return np.array([int(state[key] / bucket_range_per_feature[key])
                     for key in sorted(state)])
from IPython.display import Image, display
# For Epsilon-greedy
MIN_EXPLORING_RATE = 0.01  # floor of the exploration-rate decay

update_every_iteration = 1000   # sync target network every N training steps
print_every_episode = 500       # evaluate greedily / log every N episodes
save_video_every_episode = 500  # record an animation every N episodes
NUM_EPISODE = 4000              # total number of training episodes
NUM_EXPLORE = 20                # episodes of pure experience collection before training starts
BATCH_SIZE = 32                 # minibatch size sampled from the replay buffer

iter_num = 0  # global training-step counter (drives target-network sync)
# Main DQN training loop. Every `print_every_episode`-th episode runs greedily
# (exploration off, transitions NOT stored) as an evaluation pass; all other
# episodes collect experience and, once past NUM_EXPLORE, train each step.
for episode in range(0, NUM_EPISODE + 1):
    # Reset the environment
    env.reset_game()
    # record frame
    if episode % save_video_every_episode == 0:
        frames = [env.getScreenRGB()]
    # get input state
    state = preprocess_state(game.getGameState())
    # for every 500 episodes, shutdown exploration to see the performance of greedy action
    if episode % print_every_episode == 0:
        online_agent.shutdown_explore()
    # cumulate reward for this episode
    cum_reward = 0
    t = 0
    while not env.game_over():
        # feed current state and select an action
        action = online_agent.select_action(state)
        # execute the action and get reward
        reward = env.act(env.getActionSet()[action])
        # record frame
        if episode % save_video_every_episode == 0:
            frames.append(env.getScreenRGB())
        # cumulate reward
        cum_reward += reward
        # observe the result
        state_prime = preprocess_state(game.getGameState())  # get next state
        # append experience for this episode
        # (evaluation episodes are deliberately kept out of the buffer)
        if episode % print_every_episode != 0:
            buffer.add((state, action, reward, state_prime, env.game_over()))
        # Setting up for the next iteration
        state = state_prime
        t += 1
        # update agent (one gradient step per environment step)
        if episode > NUM_EXPLORE and episode % print_every_episode != 0:
            iter_num += 1
            train_states, train_actions, train_rewards, train_states_prime, terminal = buffer.sample(BATCH_SIZE)
            train_states = np.asarray(train_states).reshape(-1, 8)
            train_states_prime = np.asarray(train_states_prime).reshape(-1, 8)
            # convert Python object to Tensor to prevent graph re-tracing
            train_states = tf.convert_to_tensor(train_states, tf.float32)
            #print(train_states.shape)
            train_actions = tf.convert_to_tensor(train_actions, tf.int32)
            train_rewards = tf.convert_to_tensor(train_rewards, tf.float32)
            train_states_prime = tf.convert_to_tensor(train_states_prime, tf.float32)
            terminal = tf.convert_to_tensor(terminal, tf.bool)
            train_step(train_states, train_actions, train_rewards, train_states_prime, terminal)
            # synchronize target model's weight with online model's weight every 1000 iterations
            if iter_num % update_every_iteration == 0 and episode > NUM_EXPLORE and episode % print_every_episode != 0:
                target_agent.model.set_weights(online_agent.model.get_weights())
    # update exploring rate (both agents share the same decay schedule)
    online_agent.update_parameters(episode)
    target_agent.update_parameters(episode)
    if episode % print_every_episode == 0 and episode > NUM_EXPLORE:
        print(
            "[{}] time live:{}, cumulated reward: {}, exploring rate: {}, average loss: {}".
            format(episode, t, cum_reward, online_agent.exploring_rate, average_loss.result()))
        average_loss.reset_states()
    if episode % save_video_every_episode == 0:  # for every 500 episode, record an animation
        clip = make_anim(frames, fps=60, true_image=True).rotate(-90)
        clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60)
        display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=240))
# notebook output:
# t: 32%|███▏ | 20/63 [00:00<00:00, 197.09it/s, now=None]
# Moviepy - Building video movie_f/DQN_demo-0.webm. Moviepy - Writing video movie_f/DQN_demo-0.webm
# t: 49%|████▉ | 31/63 [00:00<00:00, 309.65it/s, now=None]
# Moviepy - Done ! Moviepy - video ready movie_f/DQN_demo-0.webm Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4
# t: 18%|█▊ | 24/135 [00:00<00:00, 234.43it/s, now=None]
# [500] time live:134, cumulated reward: -3.0, exploring rate: 0.5, average loss: 0.7214372158050537 Moviepy - Building video movie_f/DQN_demo-500.webm. Moviepy - Writing video movie_f/DQN_demo-500.webm
# t: 30%|███ | 41/135 [00:00<00:00, 404.36it/s, now=None]
# Moviepy - Done ! Moviepy - video ready movie_f/DQN_demo-500.webm Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4
# t: 20%|█▉ | 20/102 [00:00<00:00, 195.95it/s, now=None]
# [1000] time live:101, cumulated reward: -4.0, exploring rate: 0.5, average loss: 1.1970679759979248 Moviepy - Building video movie_f/DQN_demo-1000.webm. Moviepy - Writing video movie_f/DQN_demo-1000.webm
# t: 38%|███▊ | 39/102 [00:00<00:00, 383.10it/s, now=None]
# Moviepy - Done ! Moviepy - video ready movie_f/DQN_demo-1000.webm Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4
# t: 1%| | 23/2282 [00:00<00:10, 224.98it/s, now=None]
# [1500] time live:2281, cumulated reward: 54.0, exploring rate: 0.5, average loss: 0.92523592710495 Moviepy - Building video movie_f/DQN_demo-1500.webm. Moviepy - Writing video movie_f/DQN_demo-1500.webm
# t: 2%|▏ | 39/2282 [00:00<00:05, 389.81it/s, now=None]
# Moviepy - Done ! Moviepy - video ready movie_f/DQN_demo-1500.webm Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4
# t: 2%|▏ | 23/1193 [00:00<00:05, 221.71it/s, now=None]
# [2000] time live:1192, cumulated reward: 25.0, exploring rate: 0.5, average loss: 0.8732196688652039 Moviepy - Building video movie_f/DQN_demo-2000.webm. Moviepy - Writing video movie_f/DQN_demo-2000.webm
# t: 3%|▎ | 41/1193 [00:00<00:02, 405.90it/s, now=None]
# Moviepy - Done ! Moviepy - video ready movie_f/DQN_demo-2000.webm Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4
# [2500] time live:22508, cumulated reward: 591.0, exploring rate: 0.43277903725889943, average loss: 0.973191499710083 Moviepy - Building video movie_f/DQN_demo-2500.webm. Moviepy - Writing video movie_f/DQN_demo-2500.webm
# Moviepy - Done ! Moviepy - video ready movie_f/DQN_demo-2500.webm Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4
# --------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-11-09e00ff4dfae> in <module> 91 clip = make_anim(frames, fps=60, true_image=True).rotate(-90) 92 clip.write_videofile("movie_f/DQN_demo-{}.webm".format(episode), fps=60) ---> 93 display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=240)) ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in ipython_display(clip, filetype, maxduration, t, fps, rd_kwargs, center, **html_kwargs) 219 220 return HTML2(html_embed(clip, filetype=filetype, maxduration=maxduration, --> 221 center=center, rd_kwargs=rd_kwargs, **html_kwargs)) ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in html_embed(clip, filetype, maxduration, rd_kwargs, center, **html_kwargs) 106 107 return html_embed(filename, maxduration=maxduration, rd_kwargs=rd_kwargs, --> 108 center=center, **html_kwargs) 109 110 filename = clip ~/anaconda3/envs/game/lib/python3.6/site-packages/moviepy/video/io/html_tools.py in html_embed(clip, filetype, maxduration, rd_kwargs, center, **html_kwargs) 141 if duration > maxduration: 142 raise ValueError("The duration of video %s (%.1f) exceeds the 'maxduration' "%(filename, duration)+ --> 143 "attribute. You can increase 'maxduration', by passing 'maxduration' parameter" 144 "to ipython_display function." 145 "But note that embedding large videos may take all the memory away !") ValueError: The duration of video __temp__.mp4 (375.1) exceeds the 'maxduration' attribute. You can increase 'maxduration', by passing 'maxduration' parameterto ipython_display function.But note that embedding large videos may take all the memory away !
from moviepy.editor import *

# Re-load the recorded episode-2500 video and embed it in the notebook.
# maxduration is raised to 1200s because this clip (375s) exceeds the
# default embed limit, which caused the ValueError above.
clip = VideoFileClip("movie_f/DQN_demo-2500.webm")
display(clip.ipython_display(fps=60, autoplay=1, loop=1, maxduration=1200))
# notebook output:
# Moviepy - Building video __temp__.mp4. Moviepy - Writing video __temp__.mp4
# Moviepy - Done ! Moviepy - video ready __temp__.mp4